library(dplyr)
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
histologies_df <- read_tsv(file.path("..", "..", "data", 
                                     "pbta-histologies.tsv"))
Parsed with column specification:
cols(
  .default = col_character(),
  OS_days = col_double(),
  age_last_update_days = col_double(),
  normal_fraction = col_double(),
  tumor_fraction = col_double(),
  tumor_ploidy = col_double(),
  molecular_subtype = col_logical()
)
See spec(...) for full column specifications.
221 parsing failures.
 row               col           expected actual                              file
2606 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
2607 molecular_subtype 1/0/T/F/TRUE/FALSE Group4 '../../data/pbta-histologies.tsv'
2608 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
2609 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
2610 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
.... ................. .................. ...... .................................
See problems(...) for more details.
chordoma_samples <- histologies_df %>%
  filter(short_histology == "Chordoma") %>% 
  pull(Kids_First_Biospecimen_ID)
# TODO: update to use consensus file and likely a more permissive version of 
# the focal-cn-file-preparation annotation step
# Here, we're using an older version of the annotated files that used exons
focal_cn_df <- read_tsv("https://github.com/AlexsLemonade/OpenPBTA-analysis/raw/fa214291713575be7fd20c92374b268870f4173f/analyses/focal-cn-file-preparation/results/cnvkit_annotated_cn_autosomes.tsv.gz")
Parsed with column specification:
cols(
  biospecimen_id = col_character(),
  status = col_character(),
  copy_number = col_double(),
  ploidy = col_double(),
  ensembl = col_character(),
  gene_symbol = col_character(),
  cytoband = col_character()
)
chordoma_loss <- focal_cn_df %>% 
  filter(biospecimen_id %in% chordoma_samples, gene_symbol == "SMARCB1")

chordoma_loss 
#we need to include the sample_id field from pbta-histologies.tsv in the final table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression values) and WGS data (e.g., SMARCB1 focal copy #number status) from the same event for a given individual).
#To get the SMARCB1 jitter plot in the photo here #250 (comment), you will first need to read in the #collapsed expression data

expression_data <- read_rds(file.path("..", "..", "data", "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"))
expression_data
chordoma_id_df <- histologies_df %>% 
  # only rows with chordoma samples
  filter(short_histology == "Chordoma") %>%
  # select only these columns that we'll need later
  select(Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
         experimental_strategy)
chordoma_id_df
copy_neutral_df <- chordoma_id_df %>% 
  # the copy events can only be taken from WGS data not RNA-seq data
  # we also only want biospecimens where a loss was not recorded to avoid duplicates
  filter(experimental_strategy == "WGS",
         !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)) %>%
  # if there's no loss, let's assume status is copy neutral
  mutate(status = "neutral") %>%
  # let's get the columns to match chordoma_loss
  rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
  select(biospecimen_id, status)
copy_neutral_df
chordoma_copy <- chordoma_loss %>% 
  #join the losses with the neutrals to get a new data frame
  select(biospecimen_id, status) %>%
  bind_rows(copy_neutral_df)
chordoma_copy

Need to get the sample_id that corresponds to biospecimen_id into chordoma_copy so we can match WGS and RNA-seq biospecimens from the same event/sample:

chordoma_copy <- chordoma_copy %>%
  # get only the Kids_First_Biospecimen_ID, sample_id columns from our identifier data.frame
  # then use biospecimen IDs to add the sample_id info
  inner_join(select(chordoma_id_df,
                    Kids_First_Biospecimen_ID,
                    sample_id),
             by = c("biospecimen_id" = "Kids_First_Biospecimen_ID"))
chordoma_copy

look at SMARCB1 expression values only in chordoma

# get the row that contains the SMARCB1 values
# gene symbols are rownames
smarcb1_expression <- expression_data[which(rownames(expression_data) == "SMARCB1"), ]

# now only the columns correspond to chordoma samples
smarcb1_expression <- smarcb1_expression[, which(colnames(expression_data) %in% chordoma_samples) ]
smarcb1_expression

The smarcb1_expression is a not a friendly form ^^; Transposing needed:

# transpose such that samples are rows
smarcb1_expression <- t(smarcb1_expression) %>%
  # make a data.frame
  as.data.frame() %>%
  # we want the rownames that are biospecimen identifers as their own column called Kids_First_Biospecimen_ID
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  # give SMARCB1 column a slightly better column name
  rename(SMARCB1_expression = SMARCB1)
smarcb1_expression

This also needs sample_id to add it in:

smarcb1_expression <- smarcb1_expression %>%
  inner_join(select(chordoma_id_df,
                    Kids_First_Biospecimen_ID,
                    sample_id),
             by = "Kids_First_Biospecimen_ID")
smarcb1_expression
chordoma_smarcb1_df <- smarcb1_expression %>%
  # any missing samples will get filled with NA when using a full join
  full_join(chordoma_copy, by = "sample_id")
chordoma_smarcb1_df
# this step adds in the participant identifier (sample_id to match between the two data.frame)
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  inner_join(distinct(select(chordoma_id_df, 
                             sample_id, 
                             Kids_First_Participant_ID)),
             by = "sample_id")

# combining the two biospecimen identifiers to a single column (all biospecimen IDs for a sampl separated by a comma)
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  mutate(Kids_First_Biospecimen_ID = if_else(
    # one sample is missing WGS data, so if that's true only include biospecimen ID from RNA-seq
    is.na(biospecimen_id),
    Kids_First_Biospecimen_ID,
    paste(Kids_First_Biospecimen_ID, biospecimen_id, sep = ", ")
  ))
chordoma_smarcb1_df
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  select(Kids_First_Participant_ID, 
         Kids_First_Biospecimen_ID, 
         sample_id,
         status,
         SMARCB1_expression) %>%
  # 'status' is replaced a more descriptive name
  rename(focal_SMARCB1_status = status)
chordoma_smarcb1_df
# this specifies that this is the data we want to plot
chordoma_smarcb1_df %>%
  # drop the sample that doesn't have WGS data
  tidyr::drop_na() %>%
  # this step specifies what should go on the x- and y-axes
  ggplot(aes(x = focal_SMARCB1_status,
             y = SMARCB1_expression)) +
  # we want a jitter plot where the points aren't too far 
  # apart that's what width does
  geom_jitter(width = 0.1) +
  # this is plotting the median as a blue diamond
  stat_summary(fun.y = "median", 
               geom = "point", 
               size = 3, 
               color = "blue", 
               shape = 18)

Brachyury / T / TBXT protein expression

---
title: "Subtyping chordoma"
output: html_notebook
author: Mateusz Koptyra 
date: 20191121
---

```{r}
library(dplyr)
library(readr)
library(ggplot2)
```

```{r}
histologies_df <- read_tsv(file.path("..", "..", "data", 
                                     "pbta-histologies.tsv"))
```
```{r}
chordoma_samples <- histologies_df %>%
  filter(short_histology == "Chordoma") %>% 
  pull(Kids_First_Biospecimen_ID)
```

```{r}
# TODO: update to use consensus file and likely a more permissive version of 
# the focal-cn-file-preparation annotation step
# Here, we're using an older version of the annotated files that used exons
focal_cn_df <- read_tsv("https://github.com/AlexsLemonade/OpenPBTA-analysis/raw/fa214291713575be7fd20c92374b268870f4173f/analyses/focal-cn-file-preparation/results/cnvkit_annotated_cn_autosomes.tsv.gz")
```

```{r}
chordoma_loss <- focal_cn_df %>% 
  filter(biospecimen_id %in% chordoma_samples, gene_symbol == "SMARCB1")

chordoma_loss 
```

```{r}
#we need to include the sample_id field from pbta-histologies.tsv in the final table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression values) and WGS data (e.g., SMARCB1 focal copy #number status) from the same event for a given individual).
#To get the SMARCB1 jitter plot in the photo here #250 (comment), you will first need to read in the #collapsed expression data

expression_data <- read_rds(file.path("..", "..", "data", "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"))
expression_data
```

```{r}
chordoma_id_df <- histologies_df %>% 
  # only rows with chordoma samples
  filter(short_histology == "Chordoma") %>%
  # select only these columns that we'll need later
  select(Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
         experimental_strategy)
chordoma_id_df
```

```{r}
copy_neutral_df <- chordoma_id_df %>% 
  # the copy events can only be taken from WGS data not RNA-seq data
  # we also only want biospecimens where a loss was not recorded to avoid duplicates
  filter(experimental_strategy == "WGS",
         !(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)) %>%
  # if there's no loss, let's assume status is copy neutral
  mutate(status = "neutral") %>%
  # let's get the columns to match chordoma_loss
  rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
  select(biospecimen_id, status)
copy_neutral_df
```

```{r}
chordoma_copy <- chordoma_loss %>% 
  #join the losses with the neutrals to get a new data frame
  select(biospecimen_id, status) %>%
  bind_rows(copy_neutral_df)
chordoma_copy
```
Need to get the sample_id that corresponds to biospecimen_id into chordoma_copy so we can match WGS and RNA-seq biospecimens from the same event/sample:
```{r}
chordoma_copy <- chordoma_copy %>%
  # get only the Kids_First_Biospecimen_ID, sample_id columns from our identifier data.frame
  # then use biospecimen IDs to add the sample_id info
  inner_join(select(chordoma_id_df,
                    Kids_First_Biospecimen_ID,
                    sample_id),
             by = c("biospecimen_id" = "Kids_First_Biospecimen_ID"))
chordoma_copy
```
look at SMARCB1 expression values only in chordoma

```{r}
# get the row that contains the SMARCB1 values
# gene symbols are rownames
smarcb1_expression <- expression_data[which(rownames(expression_data) == "SMARCB1"), ]

# now only the columns correspond to chordoma samples
smarcb1_expression <- smarcb1_expression[, which(colnames(expression_data) %in% chordoma_samples) ]
smarcb1_expression
```

The smarcb1_expression is a not a friendly form ^^; Transposing needed: 
```{r}
# transpose such that samples are rows
smarcb1_expression <- t(smarcb1_expression) %>%
  # make a data.frame
  as.data.frame() %>%
  # we want the rownames that are biospecimen identifers as their own column called Kids_First_Biospecimen_ID
  tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
  # give SMARCB1 column a slightly better column name
  rename(SMARCB1_expression = SMARCB1)
smarcb1_expression
```
This also needs sample_id to add it in:

```{r}
smarcb1_expression <- smarcb1_expression %>%
  inner_join(select(chordoma_id_df,
                    Kids_First_Biospecimen_ID,
                    sample_id),
             by = "Kids_First_Biospecimen_ID")
smarcb1_expression
```

```{r}
chordoma_smarcb1_df <- smarcb1_expression %>%
  # any missing samples will get filled with NA when using a full join
  full_join(chordoma_copy, by = "sample_id")
chordoma_smarcb1_df
```

```{r}
# this step adds in the participant identifier (sample_id to match between the two data.frame)
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  inner_join(distinct(select(chordoma_id_df, 
                             sample_id, 
                             Kids_First_Participant_ID)),
             by = "sample_id")

# combining the two biospecimen identifiers to a single column (all biospecimen IDs for a sampl separated by a comma)
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  mutate(Kids_First_Biospecimen_ID = if_else(
    # one sample is missing WGS data, so if that's true only include biospecimen ID from RNA-seq
    is.na(biospecimen_id),
    Kids_First_Biospecimen_ID,
    paste(Kids_First_Biospecimen_ID, biospecimen_id, sep = ", ")
  ))
chordoma_smarcb1_df
```

```{r}
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
  select(Kids_First_Participant_ID, 
         Kids_First_Biospecimen_ID, 
         sample_id,
         status,
         SMARCB1_expression) %>%
  # 'status' is replaced a more descriptive name
  rename(focal_SMARCB1_status = status)
chordoma_smarcb1_df
```

```{r}
# this specifies that this is the data we want to plot
chordoma_smarcb1_df %>%
  # drop the sample that doesn't have WGS data
  tidyr::drop_na() %>%
  # this step specifies what should go on the x- and y-axes
  ggplot(aes(x = focal_SMARCB1_status,
             y = SMARCB1_expression)) +
  # we want a jitter plot where the points aren't too far 
  # apart that's what width does
  geom_jitter(width = 0.1) +
  # this is plotting the median as a blue diamond
  stat_summary(fun.y = "median", 
               geom = "point", 
               size = 3, 
               color = "blue", 
               shape = 18)
```

## Brachyury / T / TBXT protein expression
